Highlighting collaborations in SNSF grants running in 2017.
Data
p3url <- c(
"http://p3.snf.ch/P3Export/P3_GrantExport.csv",
"http://p3.snf.ch/P3Export/P3_PersonExport.csv")
# "http://p3.snf.ch/P3Export/P3_CollaborationExport.csv")
purrr::walk2(p3url, saveas, download.file, quiet = TRUE)You can download up-to-date SNSF grant data here. Before reading the datasets in R, however, I recommend using MS Excel (or Libre/Open Office) to fix csv parsing problems.
grants.csv and people.csv correspond to P3_GrantExport.csv and P3_PersonExport.csv, respectively. I made sure they read faultlessly and cleaned their headers with the clean_names() function from the janitor package.
Grants connecting Switzerland
We want to highlight 2017 grants connecting institutes from different parts of Switzerland and of the world.
p_load("lubridate", "stringr")
grants <- fread("grants.csv")
people <- fread("people.csv")
# focus on a time range
trange <- ymd(c(str_c(YEAR, "-01-01"), str_c(YEAR, "-12-31")))
# find grants running in that time range
grants <- grants %>%
select(
project_number, discipline_number, start_date, end_date) %>%
mutate(
start_date = ymd(str_sub(start_date, 1, 10)),
end_date = ymd(str_sub(end_date, 1, 10)),
domain = as.integer(str_extract_all(discipline_number, "^[0-9]"))) %>%
filter(start_date <= trange[2], end_date >= trange[1])
head(grants) project_number discipline_number start_date end_date domain
1 128565 10105 2011-03-01 2018-02-28 1
2 129572 10105 2011-01-01 2017-02-28 1
3 133815 20404 2012-04-01 2017-03-31 2
4 135192 30720 2012-07-01 2018-04-30 3
5 135970 10104 2014-01-01 2017-12-31 1
6 136707 20507 2012-02-01 2017-05-31 2
# find the people involved in those grants
people <- people %>%
select(person_id_snsf, institute_place, starts_with("projects")) %>%
filter(institute_place != "") %>%
select(-projects_as_responsible_applicant) %>%
unite(project_number, starts_with("projects"), sep = ";") %>%
mutate(
institute_place = str_replace_all(institute_place, " Cedex(.*)?| [0-9]{1,2}", ""),
project_number = str_replace_all(project_number, "[;]+", ";"),
project_number = str_replace_all(project_number, "^[;]|[;]$|NA", ""),
project_number = str_split(project_number, ";")) %>%
unnest() %>%
mutate(project_number = as.integer(project_number)) %>%
semi_join(grants, by = "project_number")
head(people) person_id_snsf institute_place project_number
1 587432 Stavanger 171200
2 702008 Fribourg 175960
3 687469 Basel 170809
4 642410 Lausanne 153952
5 642410 Lausanne 153990
6 54956 Tirana 152346
The collaboration network
To show the network in a map, we focus on grants connecting people in different places.
# how many distinct places per grant?
places_per_grant <- group_by(people, project_number) %>%
summarise(n_places = n_distinct(institute_place))
# network core: projects involving multiple places (n_places > 1)
core <- people %>%
select(-person_id_snsf) %>%
semi_join(filter(places_per_grant, n_places > 1), by = "project_number") %>%
arrange(project_number) %>%
distinct()
head(core) institute_place project_number
1 Konstanz 128565
2 Bern 128565
3 Berlin 129572
4 Basel 129572
5 St. Gallen 135970
6 Rostock 135970
We also need the geocodes of all the places of the network. You find more information on how to get geocode data in R here and here. In essence, I use two APIs: openstreetmaps and googe maps as fallback. To save the effort of writing html requests, I use dedicated R packages to query map data. To speed up the analysis, we will store the geocodes in geocodes.csv.
# initialize geocodes as a tibble
geocodes <- tibble(
place = sort(unique(core$institute_place)),
lat = rep(NA_real_, n_distinct(core$institute_place)),
lon = rep(NA_real_, n_distinct(core$institute_place)),
addr = rep(NA_character_, n_distinct(core$institute_place)),
id = rep(NA_character_, n_distinct(core$institute_place)))
p_load_gh("hrbrmstr/nominatim", "ggmap") # ggmap requires `libpng16-dev` in ubuntu
osm_key <- readLines("mapquest.key")
# openstreetmap api
osm <- function(query, osm_key) {
r <- osm_search_spatial(query, limit = 1, key = osm_key)
if (!is.null(r[[1]])) {
c(r[[1]]$place_id, r[[1]]$display_name, r[[1]]$lat, r[[1]]$lon)
} else return(NA)
}
# googlemaps api (2500 reqs/day, 50 reqs/sec max)
google <- function(query) {
r <- geocode(query, output = "all")
if (r$status == "OK") {
c(r$results[[1]]$place_id, r$results[[1]]$formatted_address,
r$results[[1]]$geometry$location$lat, r$results[[1]]$geometry$location$lng)
} else return(NA)
}
for (k in seq_along(geocodes$place)) {
cat(" ........ ", k, ": ", geocodes$place[k], "\n")
info <- osm(geocodes$place[k], osm_key)
cat(" osm info: ", info, "\n")
# fallback
if (is.na(info)) {
info <- google(geocodes$place[k])
cat(" google: ", info, "\n")
}
# store info in meta
if (!is.na(info)) {
geocodes$id[k] = info[1]
geocodes$addr[k] = info[2]
geocodes$lat[k] = as.numeric(info[3])
geocodes$lon[k] = as.numeric(info[4])
}
}
# it works despite of some warnings...
write_csv(geocodes, path = "geocodes.csv")geocodes <- fread("geocodes.csv") %>% drop_na()
head(geocodes) place lat lon
1: 69120 Heidelberg 49.416449 8.688807
2: Aarau 47.392715 8.044445
3: Aarhus C 56.149628 10.213405
4: Aberdeen 57.145245 -2.091375
5: Abidjan 5.409118 -4.042210
6: Abomey-Calavi 6.415369 2.306228
addr
1: Neuenheim, Heidelberg, Regierungsbezirk Karlsruhe, Baden-Württemberg, 69120, Germany
2: Aarau, Bezirk Aarau, Aargau, 5000, Switzerland
3: Aarhus, Aarhus Municipality, Central Denmark Region, Denmark
4: Aberdeen, Aberdeenshire, Scotland, United Kingdom
5: Abidjan, Côte d'Ivoire
6: Abomey-Calavi, Atlantique, Benin
id
1: 151274321
2: 151329375
3: 133615
4: 113817
5: 151537515
6: 151484207
# add geocode data to the network core
core <- left_join(core, geocodes, by = c("institute_place" = "place"))Swiss collaboration network
Lets show the Swiss network first.
The visualization is heavily inspired from here and/or here.
core_ch <- filter(core, str_detect(addr, "Switzerland$"))
# find projects on more than one place in Switzerland
ch_per_grant <- group_by(core_ch, project_number) %>%
summarise(n_places = n_distinct(institute_place))
# swiss core: projects involving multiple places in Switzerland
core_ch <- semi_join(core_ch, filter(ch_per_grant, n_places > 1), by = "project_number")
# nodes (places) ----------------------------------------------------------
nodes_ch <- group_by(core_ch, institute_place) %>%
summarise(y = head(lat, 1), x = head(lon, 1), size = n()) %>%
arrange(desc(size))
head(nodes_ch)
write_csv(nodes_ch, path = "nodes_ch.csv")
p_load("magrittr")
project_edges <- function(number, df) {
links <- filter(df, project_number == number) %>%
select(institute_place) %$%
combn(sort(institute_place), m = 2)
tibble(from = links[1, ],
to = links[2,]) %>%
mutate(number = number) %>%
select(number, everything())
}
# edges (grants) ----------------------------------------------------------
edges_ch <- purrr::map(unique(core_ch$project_number), project_edges, df = core_ch)
edges_ch <- do.call(bind_rows, edges_ch)
# add geocode metadata and group
edges_ch <- edges_ch %>%
left_join(select(nodes_ch, -size), by = c("from" = "institute_place")) %>%
rename(x1 = x, y1 = y) %>%
left_join(select(nodes_ch, -size), by = c("to" = "institute_place")) %>%
rename(x2 = x, y2 = y) %>%
group_by(from, to) %>%
summarise(
x1 = head(x1, 1),
y1 = head(y1, 1),
x2 = head(x2, 1),
y2 = head(y2, 1),
strength = n())
head(edges_ch)
write_csv(edges_ch, path = "edges_ch.csv")# plot the nodes and the edges on the map of Switzerland
p_load("maps", "mapdata", "geosphere")
nodes_ch <- fread("nodes_ch.csv")
edges_ch <- fread("edges_ch.csv")
col_dark <- adjustcolor("#252525", alpha = 0.05)
col_light <- adjustcolor("#08306b", alpha = 0.4)
edge_pal <- colorRampPalette(c(col_dark, col_light), alpha = TRUE)
edge_col <- edge_pal(100)
svg("core_ch.svg", width = 12, height = 7)
par(mar = c(0, 0, 0, 0))
maps::map(database = "worldHires", regions = "Switzerland",
fill = FALSE, col = rgb(0, 0, 0, .2))
# map nodes
points(x = nodes_ch$x, y = nodes_ch$y, pch = 16,
cex = log(nodes_ch$size)/3, col = rgb(0, 0, 0, .2))
# map edges
for (k in 1:nrow(edges_ch)) {
arc <- gcIntermediate(
c(edges_ch$x1[k], edges_ch$y1[k]),
c(edges_ch$x2[k], edges_ch$y2[k]),
n = 100, addStartEnd = TRUE)
strength <- round(100 * sqrt(edges_ch$strength[k]) / max(sqrt(edges_ch$strength)))
lines(arc, col = edge_col[strength], lwd = strength / 10)
}
# label main cities
cities <- c("Zürich", "Lausanne", "Bern", "Genève",
"Basel", "Fribourg", "Neuchâtel", "St. Gallen",
"Lugano", "Luzern", "Winterthur")
city_nodes <- filter(nodes_ch, institute_place %in% cities)
graphics::text(
city_nodes$x, city_nodes$y,
labels = city_nodes$institute_place, pos = 3, cex = .7, col = rgb(0, 0, 0, .66))
invisible(dev.off())switzerland
International collaboration network
Same as above, but for the rest of the world.
nodes_io <- group_by(core, institute_place) %>%
summarise(y = head(lat, 1), x = head(lon, 1), size = n()) %>%
arrange(desc(size))
write_csv(nodes_io, path = "nodes_io.csv")
edges_io <- purrr::map(unique(core$project_number), project_edges, df = core)
edges_io <- do.call(bind_rows, edges_io)
# add geocode metadata and group
edges_io <- edges_io %>%
left_join(select(nodes_io, -size), by = c("from" = "institute_place")) %>%
rename(x1 = x, y1 = y) %>%
left_join(select(nodes_io, -size), by = c("to" = "institute_place")) %>%
rename(x2 = x, y2 = y) %>%
group_by(from, to) %>%
summarise(
x1 = head(x1, 1),
y1 = head(y1, 1),
x2 = head(x2, 1),
y2 = head(y2, 1),
strength = n())
write_csv(edges_io, path = "edges_io.csv")# plot the nodes and the edges on the world ma
nodes_io <- fread("nodes_io.csv")
edges_io <- fread("edges_io.csv")
nodes_io <- anti_join(nodes_io, nodes_ch, by = "institute_place")
edges_io <- edges_io %>%
anti_join(edges_ch, by = c("from" = "from", "to" = "to")) %>%
drop_na()
col_dark <- adjustcolor("#252525", alpha = 0.01)
col_light <- adjustcolor("#08306b", alpha = 0.2)
edge_pal <- colorRampPalette(c(col_dark, col_light), alpha = TRUE)
edge_col <- edge_pal(100)
svg("core.svg", width = 12, height = 7)
par(mar = c(0, 0, 0, 0))
maps::map("world", fill = FALSE, col = rgb(0, 0, 0, .2))
# map nodes
points(x = nodes_io$x, y = nodes_io$y, pch = 16,
cex = log(nodes_io$size)/3, col = rgb(0, 0, 0, .2))
# map edges
for (k in 1:nrow(edges_io)) {
arc <- gcIntermediate(
c(edges_io$x1[k], edges_io$y1[k]),
c(edges_io$x2[k], edges_io$y2[k]),
n = 100, addStartEnd = TRUE)
strength <- round(100 * sqrt(edges_io$strength[k]) / max(sqrt(edges_io$strength)))
lines(arc, col = edge_col[strength], lwd = strength / 50)
}
invisible(dev.off())world
About
This page uses the html_cleam template from the prettydoc package.